Load packages

library(data.table)   # For faster data manipulation
library(tidyverse)    # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()     masks data.table::between()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ dplyr::first()       masks data.table::first()
## ✖ lubridate::hour()    masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ dplyr::last()        masks data.table::last()
## ✖ lubridate::mday()    masks data.table::mday()
## ✖ lubridate::minute()  masks data.table::minute()
## ✖ lubridate::month()   masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second()  masks data.table::second()
## ✖ purrr::transpose()   masks data.table::transpose()
## ✖ lubridate::wday()    masks data.table::wday()
## ✖ lubridate::week()    masks data.table::week()
## ✖ lubridate::yday()    masks data.table::yday()
## ✖ lubridate::year()    masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)           # For spatial data handling
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(leaflet)      # For interactive maps
library(leaflet.extras) # For additional leaflet features
library(mapview)      # For easier map visualization
library(tmap)         # For thematic maps
library(tigris)       # For US road networks
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(future)       # For parallel processing
library(future.apply) # For parallel processing with apply functions

# Create directories if they don't exist
if (!dir.exists("./data/tigris")) {
  dir.create("./data/tigris", recursive = TRUE)
}

# Set custom cache directory (optional)
options(tigris_cache_dir = "./data/tigris")
# Configure tigris to use caching
options(tigris_use_cache = TRUE)

Set-up convenience functions for parallel processing (as data is quite large)

# Function to create buffers in batches with proper projection
create_buffers_in_batches <- function(sf_object, buffer_dist, batch_size = 500) {
  # First, reproject to an appropriate projected CRS for the region
  # For California, UTM Zone 10N (EPSG:26910) or 11N (EPSG:26911) works well
  # If we're unsure about your region, a web mercator projection (3857) is a reasonable default
  message("Reprojecting data to a meter-based CRS...")
  sf_object_projected <- st_transform(sf_object, 3310)  # Web Mercator
  
  n_features <- nrow(sf_object_projected)
  n_batches <- ceiling(n_features / batch_size)
  
  # Create empty list to store results
  buffer_list <- vector("list", n_batches)
  
  # Process in batches
  for (i in 1:n_batches) {
    start_idx <- (i-1) * batch_size + 1
    end_idx <- min(i * batch_size, n_features)
    
    cat(sprintf("Processing batch %d of %d (features %d to %d)\n", 
                i, n_batches, start_idx, end_idx))
    
    # Extract batch
    batch <- sf_object_projected[start_idx:end_idx, ]
    
    # Create buffer (with parallel processing within sf)
    buffer_list[[i]] <- st_buffer(batch, dist = buffer_dist)
  }
  
  # Combine results
  result <- do.call(rbind, buffer_list)
  
  # Reproject back to original CRS if needed
  message("Reprojecting results back to original CRS...")
  st_transform(result, st_crs(sf_object))
}

Load data (filter for California for now to limit data set size)

# Efficient approach
df.acc <- fread("data/us_accidents/US_accidents_March23.csv")[
  # Filter date range to 2016-2021
  as.Date(Start_Time) >= as.Date("2016-01-01") & 
  as.Date(Start_Time) <= as.Date("2021-12-31") & 
  # And California
  State == "CA"
][, `:=`(
  # Add year, quarter, month columns
  year = data.table::year(Start_Time),
  quarter = data.table::quarter(Start_Time),
  month = data.table::month(Start_Time),
  # Calculate duration (assuming End_Time exists in the dataset)
  duration = as.numeric(difftime(End_Time, Start_Time, units = "days"))
)] %>% 
  as_tibble()  # Convert to tibble only at the end for performance
df.const <- fread("data/us_constructions/US_constructions_Dec21.csv")[
  # Filter date range to 2016-2021
  as.Date(Start_Time) >= as.Date("2016-01-01") & 
  as.Date(Start_Time) <= as.Date("2021-12-31") & 
    # And California
  State == "CA"
][, `:=`(
  # Add year, quarter, month columns
  year = year(Start_Time),
  quarter = quarter(Start_Time),
  month = month(Start_Time),
  # Calculate duration (assuming End_Time exists in the dataset)
  duration = as.numeric(difftime(End_Time, Start_Time, units = "days"))
)] %>% 
  as_tibble()  # Convert to tibble only at the end for performance

Convert dataframes to sf objects

# Convert accident data to sf object
df.acc.sf <- df.acc %>%
  filter(!is.na(Start_Lat) & !is.na(Start_Lng)) %>%
  st_as_sf(coords = c("Start_Lng", "Start_Lat"), crs = 4326)

# Convert construction data to sf object
df.const.sf <- df.const %>%
  filter(!is.na(Start_Lat) & !is.na(Start_Lng)) %>%
  st_as_sf(coords = c("Start_Lng", "Start_Lat"), crs = 4326)
# Get US roads (this can be slow for the entire US, so we might want to limit by state)

# First get all counties in California
ca_counties <- counties("CA", year = 2021)

# Then get roads for each county
ca_roads_list <- lapply(ca_counties$COUNTYFP, function(county_code) {
  roads("CA", county = county_code, year = 2021)
})

# Optionally combine all county road data into one object
ca_roads <- do.call(rbind, ca_roads_list)
# Static map for Accidents with year coloring
# First, store your plot in a variable
ca_plot <- ggplot() +
  geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
  geom_sf(data = df.acc.sf %>% filter(State == "CA"), 
          aes(color = factor(year)), alpha = 0.7, size = 0.5) +
  scale_color_viridis_d(name = "Year") +  # Use viridis color palette for discrete values
  theme_minimal() +
  labs(title = "US Road Accidents by Year (2016-2021)") +
  theme(legend.position = "bottom")
print(ca_plot)

# Then save it using ggsave
#ggsave(filename = "imgs/california_accidents.png", 
#       plot = ca_plot,
#       width = 10, # width in inches
#       height = 8, # height in inches
#       dpi = 300)  # resolution

# Construction sites map with year coloring
# First, store our plot in a variable
ca_const_plot <- ggplot() +
  geom_sf(data = ca_roads, color = "gray80", size = 0.1) +
  geom_sf(data = df.const.sf %>% filter(State == "CA"), 
          aes(color = factor(year)), alpha = 0.7, size = 0.5) +
  scale_color_viridis_d(name = "Year") +  # Use viridis color palette for discrete values
  theme_minimal() +
  labs(title = "US Construction Sites by Year (2016-2021)") +
  theme(legend.position = "bottom")
print(ca_const_plot)

# Then save it using ggsave
#ggsave(filename = "imgs/california_construction.png", 
#       plot = ca_const_plot,
#       width = 10, # width in inches
#       height = 8, # height in inches
#       dpi = 300)  # resolution

Heatmaps - interactive

# For interactive maps (often better for large datasets)
# Accidents map
accident_map <- leaflet() %>%
  addTiles() %>%
  addHeatmap(data = df.acc.sf, 
             intensity = ~1,
             radius = 8, 
             blur = 10) %>%
  setView(lng = -119.4179, lat = 36.7783, zoom = 6) %>% # Center to CA
  setMaxBounds(lng1 = -124.6, lat1 = 42.0,    # Northwest corner of CA
               lng2 = -114.1, lat2 = 32.5)    # Southeast corner of CA

# Display maps
accident_map
# Construction sites map with California focus and bounds constraints
construction_map <- leaflet() %>%
  addTiles() %>%
  addHeatmap(data = df.const.sf, 
             intensity = ~1,
             radius = 8, 
             blur = 10,
             gradient = c("yellow", "orange", "red")) %>%
  setView(lng = -119.4179, lat = 36.7783, zoom = 6) %>%  # Center on California
  setMaxBounds(lng1 = -124.6, lat1 = 42.0,    # Northwest corner of CA
               lng2 = -114.1, lat2 = 32.5)    # Southeast corner of CA

# Display the map
construction_map